{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "template = {\n",
    "    \"description\": \"Template used by Alpaca-LoRA.\",\n",
    "    \"prompt_input\": \"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n{instruction}\\n\\n### Input:\\n{input}\\n\\n### Response:\\n\",\n",
    "    \"prompt_no_input\": \"Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\n{instruction}\\n\\n### Response:\\n\",\n",
    "    \"response_split\": \"### Response:\"    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "\n",
    "model = malaya.augmentation.abstractive.huggingface(model = 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v3', force_check = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "_ = model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['macam mana aku nak kurangkan berat badan tak pergi gym',\n",
       " 'macam mana saya boleh kurangkan berat badan tanpa pergi gym',\n",
       " 'macam mana aku boleh kurangkan berat badan aku tak gi gym',\n",
       " 'mcm mana aku boleh kurangkan badan tanpa pergi gym',\n",
       " 'mcm mana aku nak kurangkan berat badan tak pegi gym']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.generate(['bagaimana saya boleh kurangkan berat badan tanpa pergi ke gim?'],\n",
    "              max_length = 100, do_sample=True, top_k=100, top_p=0.95, temperature=0.7,\n",
    "              num_return_sequences=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "18755it [05:47, 18.74it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "47997it [07:51, 901.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "67064it [16:11, 231.22it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "68779it [16:22, 114.40it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "69670it [16:27, 183.02it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "69971it [16:29, 153.10it/s]IOPub message rate exceeded.\n",
      "The notebook server will temporarily stop sending output\n",
      "to the client in order to avoid crashing it.\n",
      "To change this limit, set the config variable\n",
      "`--NotebookApp.iopub_msg_rate_limit`.\n",
      "\n",
      "Current values:\n",
      "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
      "NotebookApp.rate_limit_window=3.0 (secs)\n",
      "\n",
      "354755it [1:33:28, 36.40it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "357079it [1:34:28, 40.04it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n",
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "359327it [1:35:25, 44.19it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "360087it [1:35:45, 62.70it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "360109it [1:35:45, 44.85it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n",
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "360392it [1:35:52, 49.72it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "360407it [1:35:53, 50.81it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "361581it [1:36:22, 52.21it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "363735it [1:37:17, 41.48it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "365229it [1:37:52, 50.31it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "365235it [1:37:52, 40.57it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n",
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "365577it [1:38:00, 45.30it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "366771it [1:38:31, 39.54it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "369145it [1:39:32, 42.44it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "369728it [1:39:47, 32.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "369757it [1:39:48, 31.66it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "371816it [1:40:41, 36.47it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "371972it [1:40:46, 38.08it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "372675it [1:41:03, 47.54it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "378839it [1:43:00, 742.89it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n",
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "379541it [1:43:01, 708.87it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "380035it [1:43:05, 116.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "398652it [1:49:41, 117.58it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "401904it [1:50:13, 1436.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "402907it [1:50:44, 37.81it/s]  "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "421898it [1:59:29, 48.42it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "422228it [1:59:38, 49.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n",
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "422910it [1:59:52, 30.28it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "423630it [2:00:12, 32.76it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "427431it [2:01:48, 30.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n",
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "428470it [2:02:18, 29.39it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "429223it [2:02:39, 30.54it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "429937it [2:02:57, 40.72it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "430618it [2:03:16, 39.87it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "431361it [2:03:36, 25.03it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "431903it [2:03:50, 33.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "432099it [2:03:54, 66.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "432844it [2:04:11, 45.17it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "433567it [2:04:30, 58.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "435732it [2:05:22, 74.75it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object is not subscriptable\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "437647it [2:06:12, 30.33it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "438533it [2:06:36, 50.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'result'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "439509it [2:07:01, 57.67it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "data = []\n",
    "with open('/home/husein/ssd3/gpt4all-1.3/translated-gpt4all-filtered-noncode.jsonl') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        l = json.loads(l)\n",
    "        try:\n",
    "            a = l['r'][-1]['result']\n",
    "            ori_q = l['src']['prompt']\n",
    "            \n",
    "            if random.random() > 0.7:\n",
    "                q = ori_q\n",
    "            else:\n",
    "                q = l['r'][0]['result']\n",
    "                if random.random() > 0.5 and '|' not in ori_q and len(q.split()) < 20:\n",
    "                    g = model.generate([q],\n",
    "                      max_length = 256, do_sample=True, top_k=100, top_p=0.95, temperature=0.7,\n",
    "                      num_return_sequences=5)\n",
    "                    q = random.choice(g)\n",
    "            \n",
    "            res = template[\"prompt_no_input\"].format(instruction=q)\n",
    "            res = f\"{res}{a}\"\n",
    "            data.append(res)\n",
    "            \n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            pass\n",
    "\n",
    "# data = []\n",
    "# for d in tqdm(gpt4all):\n",
    "#     if 'prompt' not in d or 'response' not in d:\n",
    "#         continue\n",
    "    \n",
    "#     try:\n",
    "#         if random.random() > 0.5:\n",
    "#             q = d['prompt'][0] + random.choice(malay_respond)\n",
    "#         else:\n",
    "#             q = d['prompt'][1]\n",
    "#             if random.random() > 0.5 and len(q.split()) < 20:\n",
    "#                 generated = model.generate(\n",
    "#                     [q],\n",
    "#                     max_length = 100, \n",
    "#                     do_sample=True, top_k=100, top_p=0.95, temperature=0.7,\n",
    "#                     num_return_sequences=5,\n",
    "#                 )\n",
    "#                 q = random.choice(generated)\n",
    "            \n",
    "#         if random.random() > 0.5:\n",
    "#             q = q.lower()\n",
    "\n",
    "#         a = d['response'][-1]\n",
    "#         res = template[\"prompt_no_input\"].format(instruction=q)\n",
    "#         res = f\"{res}{a}\"\n",
    "#         data.append(res)\n",
    "#     except:\n",
    "#         pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Below is an instruction that describes a task. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nJika mengambil masa 20 minit untuk berjalan kaki dari sekolah ke perhentian bas dan bas terakhir ialah 10 minit selepas kelas saya tamat, adakah saya perlu meninggalkan kelas 10 minit lebih awal?\\n\\n### Response:\\nYa, adalah bijak untuk meninggalkan kelas 10 minit lebih awal untuk memastikan anda sampai ke perhentian bas tepat pada masanya untuk menaiki bas terakhir anda.'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[-2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "427276"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('prepared-gpt4all-v1.3.json', 'w') as fopen:\n",
    "    json.dump(data, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
